package com.trywildcard.pair.util;

import com.trywildcard.pair.util.HttpAgent;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;

/**
 * Created by karthiksenthil on 11/1/14.
 *
 * Utility for converting raw HTML text into a W3C DOM {@link Document}
 * using HtmlCleaner.
 */
public class HtmlParserUtil {

    /**
     * Cleans the given HTML with a default-configured {@link HtmlCleaner} and
     * serializes the resulting tag tree into a DOM document using default
     * {@link CleanerProperties}.
     *
     * @param htmlContent the HTML markup to parse
     * @return the DOM document built from {@code htmlContent}, or {@code null}
     *         if a {@link ParserConfigurationException} or any
     *         {@link RuntimeException} is raised while cleaning or serializing
     */
    public static Document getHtmlDocumentModel(String htmlContent) {
        try {
            TagNode cleanedRoot = new HtmlCleaner().clean(htmlContent);
            DomSerializer serializer = new DomSerializer(new CleanerProperties());
            return serializer.createDOM(cleanedRoot);
        } catch (ParserConfigurationException ignored) {
            // A DOM builder could not be configured; callers receive null.
            return null;
        } catch (RuntimeException ignored) {
            // Any unchecked failure during cleaning/serialization also yields null.
            return null;
        }
    }
}